import lxml.etree as le

# Load the saved page, extract every category section, and print each
# first-level category alongside its second-level categories.
with open('edu.html', 'r', encoding='utf-8') as f:
    # Read the whole document into a single string.
    file = f.read()

    # Parse the HTML text into an lxml element tree and show the root object.
    html_xpath = le.HTML(file)
    print(html_xpath)

    # Select every <div> whose class attribute is exactly "classify_cList";
    # each one is treated as a single category section.
    div_xpath_list = html_xpath.xpath('//div[@class = "classify_cList"]')

    # One record per section:
    #   c1   -> link text found under the section's <h3> (first-level category)
    #   c2_s -> link texts found under div/span (second-level categories)
    # Both values are kept as the raw XPath result lists.
    parsed_data = [
        {
            'c1': div.xpath('./h3/a/text()'),
            'c2_s': div.xpath('./div/span/a/text()'),
        }
        for div in div_xpath_list
    ]

    # Print every collected record.
    for data in parsed_data:
        print(data['c1'], ':    ', data['c2_s'])